Exercises from Udacity’s Exploratory Data Analysis in R MOOC

Set up packages, chart themes etc

#set up chart theme function and load required packages
chart.theme.size = 14
#theme_set(theme_minimal(chart.theme.size)) #minimal theme as default
source("../r_config/r_env_setup.R") #load packages and cutom ggplot theme function and presets
primary_color = color_primary_pal
highlight_color = color_highlight_pal
set_ggplot_theme01 <- do.call(chart_theme_minimal, as.list(chart_format_default))

Reddit dataset

#load the reddit dataset
reddit_df <- read.csv("../data/reddit.csv", header = T,
                      stringsAsFactors=TRUE, sep=",",
                      nrow = 1000)

#Number of rows and columns
dim(reddit_df)
## [1] 1000   14
#List of variables
names(reddit_df)
##  [1] "id"                "gender"            "age.range"        
##  [4] "marital.status"    "employment.status" "military.service" 
##  [7] "children"          "education"         "country"          
## [10] "state"             "income.range"      "fav.reddit"       
## [13] "dog.cat"           "cheese"
#structure of the dataset
str(reddit_df)
## 'data.frame':    1000 obs. of  14 variables:
##  $ id               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender           : int  0 0 1 0 1 0 0 0 0 0 ...
##  $ age.range        : Factor w/ 7 levels "18-24","25-34",..: 2 2 1 2 2 2 2 1 3 2 ...
##  $ marital.status   : Factor w/ 5 levels "Engaged","Forever Alone",..: NA NA NA NA NA 4 3 4 4 3 ...
##  $ employment.status: Factor w/ 6 levels "Employed full time",..: 1 1 2 2 1 1 1 4 1 2 ...
##  $ military.service : Factor w/ 2 levels "No","Yes": NA NA NA NA NA 1 1 1 1 1 ...
##  $ children         : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ education        : Factor w/ 7 levels "Associate degree",..: 2 2 5 2 2 2 5 2 2 5 ...
##  $ country          : Factor w/ 60 levels "Australia","Barbados",..: 56 56 56 56 56 56 20 56 56 20 ...
##  $ state            : Factor w/ 46 levels "","Alabama","Alaska",..: 32 32 44 32 6 32 1 6 32 1 ...
##  $ income.range     : Factor w/ 8 levels "$100,000 - $149,999",..: 2 2 8 2 7 2 NA 7 2 7 ...
##  $ fav.reddit       : Factor w/ 244 levels "","4chan","adviceanimals",..: 97 94 203 206 21 94 178 76 222 1 ...
##  $ dog.cat          : Factor w/ 3 levels "I like cats.",..: NA NA NA NA NA 2 2 2 1 1 ...
##  $ cheese           : Factor w/ 11 levels "American","Brie",..: NA NA NA NA NA 3 3 1 10 7 ...
#levels of age range
levels(reddit_df$age.range)
## [1] "18-24"       "25-34"       "35-44"       "45-54"       "55-64"      
## [6] "65 or Above" "Under 18"
#count summary of age range
table(reddit_df$age.range)
## 
##       18-24       25-34       35-44       45-54       55-64 65 or Above 
##         477         384          50          11           5           2 
##    Under 18 
##          69
#barblot for age range. Selective coloring of age ranges where count > 300
ggplot(data = reddit_df, aes(x = age.range)) +
  #geom_bar_custom +
  geom_bar(aes(fill = ..count.. > 300)) +
  scale_fill_brewer(palette="Set1")  +
  #geom_hline(yintercept=0, size=0.75, color="grey") +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
  chart_theme_bar +
  chart_footer 
chart_footnote("Source: Sample Reddit data\nNote*: <add text>")

#p1<-
ggplot(data = reddit_df, aes(x = age.range)) +
  geom_bar_custom +
  #geom_hline(yintercept=0, size=0.75, color="grey") +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
  chart_theme_bar +
  chart_footer
chart_footnote("Source: Sample Reddit data\nNote*: <add text>")

#ggsave("random_data.png", g, width=12, height=9)


#Notice that under18 bucket is at the end which is non-intuitive

Order a factor variable

reddit_df$age.range <- ordered(reddit_df$age.range, 
                             levels=c("Under 18", "18-24", "25-34", "35-44", "45-54", 
                                      "55-64", "65 or Above" ))

#or alternatively
reddit_df$age.range <- factor(reddit_df$age.range, 
                               levels=c("Under 18", "18-24", "25-34", "35-44", "45-54", 
                                        "55-64", "65 or Above" ),
                               ordered = TRUE)

#barblot for age range after ordering the factor variable
ggplot(data = reddit_df, aes(x = age.range)) +
  do.call(chart_theme_minimal, as.list(chart_format_bar2)) +
  geom_bar(colour = primary_color[1], fill = primary_color[1], alpha = 0.5, size = 0.25) +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
  geom_hline(yintercept=0, size=0.75, color="grey") 

#Notice that under18 bucket is placed as the first bar now

Pseudo Facebook Dataset

#read the dataset into R
fb_df <- read.csv("../data/pseudo_facebook.tsv", header = TRUE,
                  stringsAsFactors = FALSE, sep="\t")

#take a peek into the fb data
glimpse(fb_df)
## Observations: 99003
## Variables:
## $ userid                (int) 2094382, 1192601, 2083884, 1203168, 1733...
## $ age                   (int) 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, ...
## $ dob_day               (int) 19, 2, 16, 25, 4, 1, 14, 4, 1, 2, 22, 1,...
## $ dob_year              (int) 1999, 1999, 1999, 1999, 1999, 1999, 2000...
## $ dob_month             (int) 11, 11, 11, 12, 12, 12, 1, 1, 1, 2, 2, 2...
## $ gender                (chr) "male", "female", "male", "female", "mal...
## $ tenure                (int) 266, 6, 13, 93, 82, 15, 12, 0, 81, 171, ...
## $ friend_count          (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ friendships_initiated (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes                 (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes_received        (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes          (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes_received (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes             (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes_received    (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...

Create custom color scale for factor variables

#Create a custom color scale
myColors <- brewer.pal(3,"Set1")

fb_df$gender = factor(fb_df$gender)

#assign factor levels as names to the custom color scale
names(myColors) <- levels(fb_df$gender)

#create an alias for the custom color / fill scale
colScalegender <- scale_colour_manual(name = "gender",values = myColors)
fillScalegender <- scale_fill_manual(name = "gender",values = myColors)

Facebook data - EDA

#histogram of b'days
ggplot(data = fb_df, aes(x = dob_day)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
  xlab("Day of birth") + ylab("# of members") +
  ggtitle("Member Distribution by Day of Birth") +
  scale_x_discrete(breaks = seq(1,31, 2)) +
  scale_y_continuous(labels = comma) +
  chart_footer +
  geom_hline(yintercept=0, size=0.75, color="grey") +
  geom_vline(xintercept=1, size=0.75, color = primary_color[1]) +
  annotate("text", x = 2, y = 6500, 
           label = 
             "Abnormally high counts for 1st day of the month\nSeveral member may be selecting the first\ndropdown choice when selecting the day of birth", 
           size = text.size,color = primary_color[1],
           hjust = 0) 
chart_footnote()

#faceted by month
#histogram of b'days
ggplot(data = fb_df, aes(x = dob_day)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
  xlab("Day of birth") + ylab("# of members") +
  ggtitle("Member Distribution by Day of Birth faceted by Month") +
  scale_y_continuous(labels = comma) +
  scale_x_discrete(breaks = seq(1,31, 8)) +
  geom_hline(yintercept=0, size=0.75, color="grey")+
  facet_wrap(~dob_month, ncol = 4)
chart_footnote()

#faceted by month and using density (area under each plot is 1) and free scales
#histogram of b'days
ggplot(data = fb_df, aes(x = dob_day)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(aes(y=..density..), colour = "white", 
                 fill = primary_color[1], alpha = 0.4, size = 0.25) +
  scale_y_continuous(labels = percent) +
  xlab("Day of birth") + ylab("# of members") +
  ggtitle("Member Distribution by Day of Birth faceted by Month") +
  scale_x_discrete(breaks = seq(1,31, 8)) +
  geom_hline(yintercept=0, size=0.75, color="grey")+
  facet_wrap(~dob_month, scales = "free", ncol = 4)

#Histogram of friend count (notice that data is concentrated at Zero with few outliers)
ggplot(data = fb_df, aes(x=friend_count)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
  scale_y_continuous(labels = comma) + scale_x_continuous(labels = comma) 

#Friend count after zooming into the previous chart using coord_cartesian
ggplot(data = fb_df, aes(x=friend_count)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
  scale_y_continuous(labels = comma) + 
  scale_x_continuous(labels = comma) +
  coord_cartesian(xlim= c(0, 1000))

#Friend count after limiting the data using limits
ggplot(data = fb_df, aes(x=friend_count)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
  scale_y_continuous(labels = comma) + 
  scale_x_continuous(labels = comma, limits= c(0, 1000)) 

#Friend count after limiting the data using limits and binwidth = 25, and breaks at 50
ggplot(data = fb_df, aes(x=friend_count)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(binwidth = 25, colour = "white", fill = primary_color[1], 
                 alpha = 0.4, size = 0.25) +
  scale_y_continuous(labels = comma) + 
  scale_x_continuous(labels = comma, 
                     limits= c(0, 1000), breaks = seq(0, 1000, 100)) 

#Friend count faceted by gender
ggplot(data = fb_df, aes(x=friend_count)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(binwidth = 25, colour = "white", fill = primary_color[1], 
                 alpha = 0.4, size = 0.25) +
  scale_y_continuous(labels = comma) + 
  scale_x_continuous(labels = comma, 
                     limits= c(0, 1000), breaks = seq(0, 1000, 300)) +
  facet_wrap(~gender)

#Friend count fill by gender
ggplot(data = filter(fb_df, !is.na(gender)),  # remove records where gender is NA
       aes(x=friend_count)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_density(aes(group = gender, fill = gender, colour = gender),
                 binwidth = 25, #colour = "white", #fill = primary_color[1], 
                 alpha = 0.1, size = 0.25) +
  scale_y_continuous() + 
  scale_x_continuous(labels = comma, 
                     limits= c(0, 1000), breaks = seq(0, 1000, 100))

#Friend count fill by gender using specified color palettes
ggplot(data = na.omit(fb_df),  # remove records where any field is NA
       aes(x=friend_count)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_density(aes(group = gender, fill = gender, colour = gender),
                 binwidth = 25, #colour = "white", #fill = primary_color[1], 
                 alpha = .2, size = .25) +
  #scale_fill_manual(values=highlight_color) +  # for manually defined palette
  #scale_color_manual(values=highlight_color) +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1") +
  scale_y_continuous() + 
  scale_x_continuous(labels = comma, 
                     limits= c(0, 1000), breaks = seq(0, 1000, 100))

#count of levels in gender
fb_df %>%
  group_by(gender) %>%
  summarise(count = n())
## Source: local data frame [3 x 2]
## 
##   gender count
## 1 female 40254
## 2   male 58574
## 3     NA   175
#min value of each variable by gender 
fb_df %>%
  na.omit() %>% #filter out rows with missing values in any variable
  group_by(gender) %>%
  summarise_each(funs(min))
## Source: local data frame [2 x 15]
## 
##   gender  userid age dob_day dob_year dob_month tenure friend_count
## 1 female 1000008  13       1     1900         1      0            0
## 2   male 1000038  13       1     1900         1      0            0
## Variables not shown: friendships_initiated (int), likes (int),
##   likes_received (int), mobile_likes (int), mobile_likes_received (int),
##   www_likes (int), www_likes_received (int)
#Histogram for tenure
ggplot(data = fb_df, aes(x = tenure/365)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(binwidth = 1/4, colour = "white", fill = primary_color[1], 
                 alpha = .3, size = .25) +
  xlab("Tenure (years)") +
  ylab("# of Facebook members") +
  ggtitle("Distribution of members by tenure") +
  scale_y_continuous(labels = comma) + 
  scale_x_continuous(labels = comma, 
                     limits= c(0, 7), breaks = seq(0, 7, 1)) +
  geom_hline(yintercept=0, size=0.75, color="grey") +
  annotate("text", x = Inf, y = -Inf, label = "sumitbajaj.me",
           hjust=1.1, vjust= -.5, col="gray", cex=4, alpha = 0.8)

#histogram for user age
ggplot(data = fb_df, aes(x = age)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1],
                 alpha = 0.3, size = .25) +
  geom_hline(yintercept=0, size = 0.75, color = "grey") +
  scale_y_continuous(labels = comma) +
  ggtitle("Distribution of members by Age") +
  xlab("member age") +
  ylab("# of Facebook members")

#histogram for user age with adjusted binwidth
#A binwidth of 1 allows us to visualize any unusual spikes in the data
ggplot(data = fb_df, aes(x = age)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1],
                 alpha = 0.3, size = .25,
                 binwidth = 1) +
  geom_hline(yintercept=0, size = 0.75, color = "grey") +
  scale_y_continuous(labels = comma) +
  ggtitle("Distribution of members by Age") +
  xlab("member age") +
  ylab("# of Facebook members") +
  scale_x_discrete(labels = comma, breaks = seq(min(fb_df$age), max(fb_df$age), 10))

Transforming data - Log, Sqrt etc

summary(fb_df$friend_count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    31.0    82.0   196.4   206.0  4923.0
#Log transform for the # of likes
summary(log10(fb_df$friend_count))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -Inf       1       2    -Inf       2       4
#Log transform for the # of likes +1 to avoid infinity at log zero
summary(log10(fb_df$friend_count + 1))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.505   1.919   1.868   2.316   3.692
# visualize friend count 
p1 <-
ggplot(data = fb_df, aes(x = friend_count)) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1],
                 alpha = 0.3, size = .25) +
  scale_y_continuous(labels = comma) +
  scale_x_continuous(labels = comma)

#visualize friend count with log
p2 <-
ggplot(data = fb_df, aes(x = log10(friend_count + 1))) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1],
                 alpha = 0.3, size = .25) +
  scale_y_continuous(labels = comma)

#visualize friend count with square root
p3 <-
ggplot(data = fb_df, aes(x = sqrt(friend_count))) +
  do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
  geom_histogram(colour = "white", fill = primary_color[1],
                 alpha = 0.3, size = .25) +
  scale_y_continuous(labels = comma)

grid.arrange(p1, p2, p3, ncol = 1)

#alternate way for log scale P2. Note that the x axis label doesn't reflect log
p1 + scale_x_log10()

Frequency polygons for friend_count

Note that sum(..count..) will sum across color, so the percentages displayed are percentages of total users. To plot percentages within each group, you can try y = ..density…

ggplot(data = filter(fb_df, !is.na(gender)), 
       aes(x = friend_count)) +
  do.call(chart_theme_minimal, as.list((chart_format_hist_no_vgrid["fsize"] = 14))) + 
  geom_histogram(aes(fill = gender),
                 binwidth = 10,
                 color = 'white',
                 alpha = 0.5, size = 0.25) +
  fillScalegender +
  scale_x_continuous(labels = comma,
                     lim = c(0, 1000)) +
  scale_y_continuous(labels = comma) +
  theme(legend.position = "right")

# frequency polygon - friend count
ggplot(data = filter(fb_df, !is.na(gender)), 
       aes(x = friend_count,
           y = ..count../sum(..count..))) +
  do.call(chart_theme_minimal, as.list((chart_format_hist_no_vgrid["fsize"] = 14))) +
  geom_freqpoly(aes(color = gender, fill = gender),
                binwidth = 10,
                alpha = 1, size = 1.5) +
  colScalegender +
  #scale_color_brewer(palette = "Set1") +
  scale_y_continuous(labels = percent) +
  scale_x_continuous(labels = comma,
                     lim = c(0, 1000)) +
  theme(legend.position = "right")

Feature usage - Likes males vs females

# frequency polygon - www_likes
p1 <-
ggplot(data = filter(fb_df, !is.na(gender)), 
       aes(x = www_likes)) +
  do.call(chart_theme_minimal, as.list((chart_format_hist_no_vgrid["fsize"] = 14))) +
  geom_freqpoly(aes(color = gender, fill = gender),
                #binwidth = 10,
                alpha = 1, size = .5) +
  colScalegender +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma) +
  theme(legend.position = "right")

print(p1)

p1 + scale_x_log10() +
  ylab("Log10Count")

#Who gets more likes males or females?
by(fb_df$www_likes, fb_df$gender, sum)
## fb_df$gender: female
## [1] 3507665
## -------------------------------------------------------- 
## fb_df$gender: male
## [1] 1430175
fb_df %>%
  group_by(gender) %>%
  summarise(num_www_likes = sum(www_likes),
            n=n(),
            likes_per_person = num_www_likes/n()) 
## Source: local data frame [3 x 4]
## 
##   gender num_www_likes     n likes_per_person
## 1 female       3507665 40254         87.13830
## 2   male       1430175 58574         24.41655
## 3     NA          8590   175         49.08571
#Ans: Females get more likes overall and average female get more than 3x the # of male likes

#table <- xtable(freq,floating=FALSE)
#print(table, type = "HTML")

Boxplots - friend_count

b1 <-
ggplot(data = filter(fb_df, !is.na(gender)), 
       aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  do.call(chart_theme_minimal, as.list((chart_format_hist_no_vgrid["fsize"] = 14))) +
  scale_y_continuous(labels = comma) +
  xlab("")

print(b1)

#zoom to focus on users having 0-1000 friends
b1 +
  coord_cartesian(ylim = c(0, 1000))

#zoom further focus on users having 0-250 friends
b1 +
  coord_cartesian(ylim = c(0, 250))

#summary of friend_count by gender
by(fb_df$friend_count, fb_df$gender, summary)
## fb_df$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## fb_df$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

Logical operators for flag variables

At times variables may have very sparse data e.g. # of mobile logins. In such cases, you may want to create a variable that just captures whether a given feature was ever used.

#Summary of mobile likes
summary(fb_df$mobile_likes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     4.0   106.1    46.0 25110.0
#You can observe that more than 25% of users have zero mobile likes
summary(fb_df$mobile_likes > 0)
##    Mode   FALSE    TRUE    NA's 
## logical   35056   63947       0
#More than 35% of users have no mobile likes

#mobile checkin flag variable creation
fb_df <- fb_df %>%
  mutate(mobile_checkin = NA) %>%
  mutate(mobile_checkin = ifelse(mobile_likes >0, 1, 0)) %>%
  mutate(mobile_checkin = factor(mobile_checkin))

summary(fb_df$mobile_checkin)
##     0     1 
## 35056 63947
#What % of people ever checked in using mobile
print(sum(fb_df$mobile_checkin == 1) / length(fb_df$mobile_checkin))
## [1] 0.6459097

Diamonds dataset EDA

Diamonds- Basic dataset info

dim(diamonds)
## [1] 53940    10
glimpse(diamonds)
## Observations: 53940
## Variables:
## $ carat   (dbl) 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, ...
## $ cut     (fctr) Ideal, Premium, Good, Premium, Good, Very Good, Very ...
## $ color   (fctr) E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J,...
## $ clarity (fctr) SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, S...
## $ depth   (dbl) 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, ...
## $ table   (dbl) 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54...
## $ price   (int) 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339,...
## $ x       (dbl) 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, ...
## $ y       (dbl) 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, ...
## $ z       (dbl) 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, ...

Diamonds- EDA

ggplot(data = diamonds, aes(x = price)) +
  geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
  geom_vline(xintercept=mean(diamonds$price), size=0.75, color = "grey") +
  annotate("text", x = mean(diamonds$price)*1.03, y = 15000, 
           label = "mean", hjust = 0) +
  geom_vline(xintercept=median(diamonds$price), size=0.75, color = "grey") +
  annotate("text", x = median(diamonds$price)*1.03, y = 12000, 
           label = "median", hjust = 0)

summary(diamonds$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18820
#count by price points
diamonds %>%
  mutate(price_pnt = ifelse(price < 250, "<250", 
                            ifelse(price < 500, "<500",
                                   ifelse(price >= 15000, ">15k", "other")))) %>%
  group_by(price_pnt) %>%
  summarise(n = n())
## Source: local data frame [3 x 2]
## 
##   price_pnt     n
## 1      <500  1729
## 2      >15k  1656
## 3     other 50555

Facebook - Explore two variables

#scatterplot - relation between age and friend count
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
  set_ggplot_theme01 +
  geom_point(alpha = 1,
             color = primary_color[1])

#set alpha to 1/20 i.e 20 data points make one solid dot
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
  set_ggplot_theme01 +
  geom_point(alpha = 1/20,
             color = primary_color[1])

#limit the age from 13 to 90 yrs
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
  set_ggplot_theme01 +
  geom_point(alpha = 1/20,
             color = primary_color[1]) +
  xlim(13, 90)

#add jitter to avoid straight vertical lines
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
  set_ggplot_theme01 +
  geom_jitter(alpha = 1/20,
              color = primary_color[1]) +
  scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, 10))

#add square root Y axis to reduce right skew
#limit the age from 13 to 90 yrs
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
  set_ggplot_theme01 +
  geom_jitter(alpha = 1/20,
              position = position_jitter(h = 0),
              color = primary_color[1]) +
  scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, 10)) +
  coord_trans(y = "sqrt")

The square root, x to x^(1/2) = sqrt(x), is a transformation with a moderate effect on distribution shape: it is weaker than the logarithm and the cube root. It is also used for reducing right skewness, and also has the advantage that it can be applied to zero values. Note that the square root of an area has the units of a length. It is commonly applied to counted data, especially if the values are mostly rather small.

If we add noise to zero we could end up with -ve numbers where sqrt will be imaginary. Set the position parameter equal to position_jitter and pass it a min height of zero.

Summary metrics on charts

ggplot(data = fb_df, aes(x = age, y = friend_count)) +
  set_ggplot_theme01 +
  geom_jitter(alpha = 1/20,
              position = position_jitter(h = 0),
              color = "light grey") +
  scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, 10)) +
  coord_trans(y = "sqrt") +
  geom_line(aes(color = "Mean"), stat = "summary", fun.y = mean) +
  geom_line(aes(color = "10% Quantile"), stat = "summary", fun.y = quantile, 
            probs = 0.1, linetype = 2) +
  geom_line(aes(color = "90% Quantile"), stat = "summary", fun.y = quantile, 
            probs = 0.9, linetype = 2) +
  #scale_color_manual(values=primary_color[1]) +
  #annotate("text", x=max(fb_df$age)+1, y = mean(fb_df$friend_count), label = "mean", hjust = 0) +
  theme(legend.position = "right") +
  guides(colour = guide_legend(override.aes = list(size=1)))

#zoom in using coord-cartesian 
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
  set_ggplot_theme01 +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_jitter(alpha = 1/20,
              position = position_jitter(h = 0),
              color = "light grey") +
  #scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, 10)) +
  #coord_trans(y = "sqrt") +
  geom_line(aes(color = "Mean"), stat = "summary", fun.y = mean) +
  geom_line(aes(color = "10% Quantile"), stat = "summary", fun.y = quantile, 
            probs = 0.1, linetype = 2) +
  geom_line(aes(color = "90% Quantile"), stat = "summary", fun.y = quantile, 
            probs = 0.9, linetype = 2) +
  #scale_color_manual(values=primary_color[1]) +
  #annotate("text", x=max(fb_df$age)+1, y = mean(fb_df$friend_count), label = "mean", hjust = 0) +
  theme(legend.position = "right") +
  guides(colour = guide_legend(override.aes = list(size=1)))

###Correlation A correlation of >0.3 is mild, >0.5 is moderate and >0.7 is pretty strong

cor.test(fb_df$age, fb_df$friend_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  fb_df$age and fb_df$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
#alternatively for less verbose code
with(fb_df, cor.test(age, friend_count, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
with(filter(fb_df, age<=70), cor.test(age, friend_count, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.5923, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245
#spearman coeff for monotonic relationships
with(filter(fb_df, age<=70), cor.test(age, friend_count, method = "spearman"))
## 
##  Spearman's rank correlation rho
## 
## data:  age and friend_count
## S = 1.5782e+14, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2552934

Strong correlations

engagement variables all measure essentiall the same thing - engagement. e.g. # of photo uploads, no. of posts etc..

ggplot(data = fb_df, aes(x = www_likes_received, y = likes_received)) +
  set_ggplot_theme01 +
  geom_point(position = position_jitter(h=0),
             color = "light grey",
             alpha = 1/20) +
  xlim(0, quantile(fb_df$www_likes_received, 0.95)) +
  ylim(0, quantile(fb_df$likes_received, 0.95)) +
  geom_smooth(method = "lm", color = primary_color[1], size = 2) +
  annotate("text", x = Inf, y = -Inf, label = "sumitbajaj.me",
           hjust=1.1, vjust=-.5, col="gray", cex=4, alpha = 0.8)

#correlation coeff
cor.test(fb_df$www_likes_received, fb_df$likes_received, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  fb_df$www_likes_received and fb_df$likes_received
## t = 937.1035, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

More caution with correlation

#load the mitchel dataset for soil temp study
#install.packages("alr3")
library(alr3)
glimpse(Mitchell)
## Observations: 204
## Variables:
## $ Month (int) 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ Temp  (dbl) -5.18333, -1.65000, 2.49444, 10.40000, 14.99440, 21.7167...
#Range of the month variable
range(Mitchell$Month)
## [1]   0 203
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
  chart_theme01 +
  geom_point() +
  chart_footer+
  scale_x_continuous(breaks = seq(0, 203, 12))  #increments of 12 months

with(Mitchell, cor.test(Month, Temp, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  Month and Temp
## t = 0.8182, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063
#non parametric test
with(Mitchell, dcor.ttest(Month, Temp))
## 
##  dcor t-test of independence
## 
## data:  Month and Temp
## T = -0.939, df = 20501, p-value = 0.8261
## sample estimates:
## Bias corrected dcor 
##        -0.006558215
#notice the cyclical patterns with line chart and aspect ratio using coord_fixed (y/x)
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
  chart_theme01 +
  geom_point() +
  geom_line(color = color_primary_pal[1], alpha = 0.3, size = 2) +
  coord_fixed(ratio = 1) +
  chart_footer +
  scale_x_continuous(breaks = seq(0, 203, 12))  #increments of 12 months

#to compare monthly trends
ggplot(data = Mitchell, aes(x = Month%%12, y = Temp)) +
  chart_theme01 + 
  geom_point(color = color_primary_pal[1], alpha = 0.5, size = 4) +
  scale_x_continuous(breaks = seq(0, 13, 1)) +
  chart_footer 

###Noise and smootheming

pf.fc_by_age <- fb_df %>%
  filter(age < 71) %>%
  group_by(age) %>%
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n())

#plot mean friend count against age (yrs)
p1 <-
ggplot(data = pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  chart_theme01 +
  scale_y_continuous(limit = c(0,400)) +
  scale_x_continuous(limit = c(0, 72)) +
  geom_line(color = color_primary_pal[1], alpha = 1, size = 1) +
  geom_smooth(size = 0.1, fill = grey(0.9)) +
  chart_footer

pf.fc_by_age[15:20,]
## Source: local data frame [6 x 4]
## 
##   age friend_count_mean friend_count_median    n
## 1  27          134.1473                72.0 2240
## 2  28          125.8354                66.0 2364
## 3  29          120.8182                66.0 1936
## 4  30          115.2080                67.5 1716
## 5  31          118.4599                63.0 1694
## 6  32          114.2800                63.0 1443
#mean friend count by age (year.month) to generate an even noisier plot
pf.fc_by_age_months <- fb_df %>%
  filter(age < 71) %>%
  mutate(age_with_months = age + (12 - dob_month)/12) %>%
  group_by(age_with_months) %>%
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n())


#plot mean friend count against age.month
p2 <-
ggplot(data = pf.fc_by_age_months, aes(x = age_with_months, y = friend_count_mean)) +
  chart_theme01 +
  scale_y_continuous(limit = c(0,400)) +
  scale_x_continuous(limit = c(0, 72)) +
  geom_line(color = color_primary_pal[2], alpha = 1, size = 0.25) +
  geom_smooth(size = 0.1, fill = grey(0.9)) +
  chart_footer

#Loess regression to smooth the mean friend count
p3 <- 
  fb_df %>%
  filter(age < 71) %>%
  ggplot(aes(x = round(age/5)*5, y = friend_count)) +
  chart_theme01 +
  geom_line(stat = "summary", fun.y = mean,
            color = color_primary_pal[3], alpha = 1, size = 1) +
  scale_x_continuous(limit = c(0, 72)) +
  coord_cartesian(ylim = c(0, 400)) +
  geom_smooth(size = 0.1, fill = grey(0.9)) +
  chart_footer
  
grid.arrange(p2, p1, p3, ncol = 1)

More than two variables for EDA

Third qualitative variable

ggplot(data = subset(fb_df, !is.na(gender)), aes(x = age, y = friend_count)) +
  chart_theme01 +
  geom_line(stat = "summary", fun.y = mean,
            aes(color = gender)) +
  #colScalegender +
  scale_colour_brewer(palette="Set1") +
  legend_top +
  legend_title_hide +
  legend_size_override() +
  chart_footer +
  ggtitle("Mean friend count by age and gender")

#mean friend count by age>genger
pf.fc_by_age_gender <-
  fb_df %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(mean_friend_count = mean(friend_count),
                          median_friend_count = median(as.numeric(friend_count)),
                          n = n())

head(pf.fc_by_age_gender, 4)
## Source: local data frame [4 x 5]
## Groups: age
## 
##   age gender mean_friend_count median_friend_count    n
## 1  13 female          259.1606               148.0  193
## 2  13   male          102.1340                55.0  291
## 3  14 female          362.4286               224.0  847
## 4  14   male          164.1456                92.5 1078
#median friend count by age for each gender
ggplot(data = pf.fc_by_age_gender, aes(x = age, y = median_friend_count)) +
  chart_theme01 +
  geom_line(aes(color = gender)) +
  #colScalegender +
  scale_colour_brewer(palette="Set1") +
  legend_top +
  legend_title_hide +
  legend_size_override() +
  chart_footer +
  ggtitle("Median friend count by age and gender")

Reshape from long to wide format

#spread to wide format key-value
pf.fc_by_age_gender_wide <- pf.fc_by_age_gender %>%
  select(1, 2, 4) %>%
  spread(gender, median_friend_count) %>%
  mutate(ratio = male/female)

head(pf.fc_by_age_gender_wide, 3)
## Source: local data frame [3 x 4]
## 
##   age female  male     ratio
## 1  13    148  55.0 0.3716216
## 2  14    224  92.5 0.4129464
## 3  15    276 106.5 0.3858696
#plot ratio of median friends female / male
ggplot(data = pf.fc_by_age_gender_wide, aes(x = age, y = female/male)) +
  chart_theme01 +
  geom_line(color = color_primary_pal[2], alpha = 1, size = 0.75) +
  coord_cartesian(ylim = c(0, 3)) +
  geom_hline(yintercept =1, linetype = 2, size = 1, color = color_highlight_pal[1]) +
  chart_footer

#note: one possible reason. More growth from newer countries. Initial users more likely to be male.

Add bucketed variables like year joined using cut

#create variable for year joined fb
fb_df$year_joined = 2014 - ceiling(fb_df$tenure/365)

summary(fb_df$year_joined)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2005    2012    2012    2012    2013    2014       2
#looks like most people joined in last 2 years

table(fb_df$year_joined)
## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70
#create bucketed variable for year joined
fb_df <- fb_df %>%
  mutate(year_joined_bucket = cut(year_joined, c(2004, 2009, 2011, 2012, 2014)))

table(fb_df$year_joined_bucket)
## 
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] 
##        6669       15308       33366       43658
#plot each year_join bucket for median friend count against age
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
  chart_theme01 +
  chart_footer +
  geom_line(aes(color = year_joined_bucket),
            stat = "summary", fun.y = median) +
  scale_colour_brewer(palette="Set1") +
  legend_top +
  legend_size_override()

#Plot the grand mean as y intercept
#plot each year_join bucket for median friend count against age
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
  chart_theme01 +
  chart_footer +
  geom_line(aes(color = year_joined_bucket),
            stat = "summary", fun.y = mean) +
  scale_colour_brewer(palette="Set1") +
  geom_line(stat = "summary", fun.y = mean, linetype = 2, aes(color = "Grand Mean")) +
  annotate("text", x = 15, y = mean(fb_df$friend_count)*1.2, label = "Grand Mean") +
  legend_top +
  legend_size_override()

#Friendships initiated by tenure
ggplot(data = filter(fb_df, !is.na(tenure), !is.na(friendships_initiated), tenure >1, friendships_initiated >1), 
       aes(x = tenure, y = friendships_initiated/tenure)) +
  geom_line(aes(color = year_joined_bucket),
            stat = "summary", fun.y = mean) +
  chart_theme01 +
  chart_footer +
  legend_top + legend_size_override() 

#It appears that users with higher tenure initiate less friendships  

#Smoothening the tenure - bias variance tradeoff. as we increase the binwidth, the variance goes down and the bias goes up
ggplot(data = filter(fb_df, !is.na(tenure), !is.na(friendships_initiated), tenure >1, friendships_initiated >1), 
       aes(x = round(tenure/7)*7, y = friendships_initiated/tenure)) +
  geom_line(aes(color = year_joined_bucket),
            stat = "summary", fun.y = mean) +
  chart_theme01

ggplot(data = filter(fb_df, !is.na(tenure), !is.na(friendships_initiated), tenure >1, friendships_initiated >1), 
       aes(x = round(tenure/30)*30, y = friendships_initiated/tenure)) +
  geom_line(aes(color = year_joined_bucket),
            stat = "summary", fun.y = mean) +
  chart_theme01

ggplot(data = filter(fb_df, !is.na(tenure), !is.na(friendships_initiated), tenure >1, friendships_initiated >1), 
       aes(x = round(tenure/90)*90, y = friendships_initiated/tenure)) +
  geom_line(aes(color = year_joined_bucket),
            stat = "summary", fun.y = mean, , size = 1) +
  chart_theme01

#using geom_smooth 
ggplot(data = filter(fb_df, tenure >=1), 
       aes(x = tenure, y = friendships_initiated/tenure)) +
  geom_smooth(aes(color = year_joined_bucket)) +
  chart_theme01 +
  scale_x_continuous(labels = comma)

Yogurt purchase data EDA

yo <- read.csv("../data/yogurt.csv", header = T, stringsAsFactors = TRUE, sep = ",")
head(yo)
##   obs      id  time strawberry blueberry pina.colada plain mixed.berry
## 1   1 2100081  9678          0         0           0     0           1
## 2   2 2100081  9697          0         0           0     0           1
## 3   3 2100081  9825          0         0           0     0           1
## 4   4 2100081  9999          0         0           0     0           1
## 5   5 2100081 10015          1         0           1     0           1
## 6   6 2100081 10029          1         0           2     0           1
##   price
## 1 58.96
## 2 58.96
## 3 65.04
## 4 65.04
## 5 48.96
## 6 65.04
str(yo)
## 'data.frame':    2380 obs. of  9 variables:
##  $ obs        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ id         : int  2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
##  $ time       : int  9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
##  $ strawberry : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ blueberry  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pina.colada: int  0 0 0 0 1 2 0 0 0 0 ...
##  $ plain      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mixed.berry: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ price      : num  59 59 65 65 49 ...
#change id to factor variable from int
yo$id <- as.factor(yo$id)

#Price histogram
ggplot(data = yo, aes(x = price)) +
  chart_theme01 +
  geom_histogram(binwidth = 1, fill = color_primary_pal[1])

#the price histogram indicates discreteness

length(unique(yo$price))
## [1] 20
#only 20 unique prices

#new variable for total # of purchases in a transaction
yo <- yo %>%
  mutate(all.purchases = strawberry + blueberry + pina.colada   + plain + mixed.berry)
summary(yo$all.purchases)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   1.971   2.000  21.000
#histogram of all.purchases
ggplot(data = yo, aes(x = all.purchases))+
  chart_theme01 +
  geom_histogram(fill = color_primary_pal[1], binwidth = 1)

#scatterplot of price vs time
ggplot(data = yo, aes(x = time, y = price))+
  chart_theme01 +
  geom_point(color = color_primary_pal[1], alpha = 0.5)

#the most common prices seem to be increasing over time
#the scattered lower prices could be due to markdowns or usage of coupons

looking at a sample of yogurt households to understand deeper

#set the seed for reproducible samples
set.seed(4230)

#draw 16 households
sample.ids <- sample(levels(yo$id), 16)

#scatterplot of price vs time
ggplot(data = filter(yo, id %in% sample.ids), aes(x = time, y = price))+
  chart_theme01 +
  geom_point(color = color_primary_pal[1], alpha = 0.5, aes(size = all.purchases)) +
  geom_line(color = color_highlight_pal[1], size = .5, alpha = 0.8) +
  legend_top +
  facet_wrap(~id) +
  ggtitle("# purchases and price trend over time for 16 random households")

Scatterplot matrices

#set the seed for reproducible samples
set.seed(1836)

#extract columns 2:5 from fb dataframe
pf_subset <- fb_df[, c(2:5)]

ggpairs(pf_subset[sample.int(nrow(pf_subset), 100), ],
         params=list(corSize=4, fontsize = 4))